{
 "cells": [
 {
      "metadata": {},
      "cell_type": "markdown",
      "source": [
        "Copyright 2019 Google LLC\n",
        "\n",
        "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "you may not use this file except in compliance with the License.\n",
        "You may obtain a copy of the License at\n",
        "\n",
        "    https://www.apache.org/licenses/LICENSE-2.0\n",
        "\n",
        "Unless required by applicable law or agreed to in writing, software\n",
        "distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "See the License for the specific language governing permissions and\n",
        "limitations under the License."
      ]
    },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generating the cell lines data\n",
    "\n",
    "We download the data from https://github.com/LuyiTian/sc_mixology/tree/master/data/csv\n",
    "and copy it to the `adenocarcinoma` folder (where the notebook is launched)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading required package: SummarizedExperiment\n",
      "Loading required package: GenomicRanges\n",
      "Loading required package: stats4\n",
      "Loading required package: BiocGenerics\n",
      "Loading required package: parallel\n",
      "\n",
      "Attaching package: 'BiocGenerics'\n",
      "\n",
      "The following objects are masked from 'package:parallel':\n",
      "\n",
      "    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
      "    clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
      "    parLapplyLB, parRapply, parSapply, parSapplyLB\n",
      "\n",
      "The following objects are masked from 'package:stats':\n",
      "\n",
      "    IQR, mad, sd, var, xtabs\n",
      "\n",
      "The following objects are masked from 'package:base':\n",
      "\n",
      "    Filter, Find, Map, Position, Reduce, anyDuplicated, append,\n",
      "    as.data.frame, basename, cbind, colMeans, colSums, colnames,\n",
      "    dirname, do.call, duplicated, eval, evalq, get, grep, grepl,\n",
      "    intersect, is.unsorted, lapply, lengths, mapply, match, mget,\n",
      "    order, paste, pmax, pmax.int, pmin, pmin.int, rank, rbind,\n",
      "    rowMeans, rowSums, rownames, sapply, setdiff, sort, table, tapply,\n",
      "    union, unique, unsplit, which, which.max, which.min\n",
      "\n",
      "Loading required package: S4Vectors\n",
      "\n",
      "Attaching package: 'S4Vectors'\n",
      "\n",
      "The following object is masked from 'package:base':\n",
      "\n",
      "    expand.grid\n",
      "\n",
      "Loading required package: IRanges\n",
      "Loading required package: GenomeInfoDb\n",
      "Loading required package: Biobase\n",
      "Welcome to Bioconductor\n",
      "\n",
      "    Vignettes contain introductory material; view with\n",
      "    'browseVignettes()'. To cite Bioconductor, see\n",
      "    'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
      "\n",
      "Loading required package: DelayedArray\n",
      "Loading required package: matrixStats\n",
      "\n",
      "Attaching package: 'matrixStats'\n",
      "\n",
      "The following objects are masked from 'package:Biobase':\n",
      "\n",
      "    anyMissing, rowMedians\n",
      "\n",
      "Loading required package: BiocParallel\n",
      "\n",
      "Attaching package: 'DelayedArray'\n",
      "\n",
      "The following objects are masked from 'package:matrixStats':\n",
      "\n",
      "    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n",
      "\n",
      "The following objects are masked from 'package:base':\n",
      "\n",
      "    aperm, apply\n",
      "\n",
      "Loading required package: ggplot2\n",
      "\n",
      "Attaching package: 'scater'\n",
      "\n",
      "The following object is masked from 'package:S4Vectors':\n",
      "\n",
      "    rename\n",
      "\n",
      "The following object is masked from 'package:stats':\n",
      "\n",
      "    filter\n",
      "\n"
     ]
    }
   ],
   "source": [
    "library(SingleCellExperiment)\n",
    "library(scran)\n",
    "library(scater)\n",
    "library(Seurat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "run_qc <- function(sce) {\n",
    "  isSpike(sce, \"ERCC\") <- grepl(\"^ERCC\", rownames(sce))\n",
    "  sce <- calculateQCMetrics(sce)\n",
    "\n",
    "  # Identify outliers, but without using the mouse as a batch\n",
    "  libsize.drop <- isOutlier(sce$total_counts, nmads=3, type=\"lower\", log=TRUE)\n",
    "  feature.drop <- isOutlier(sce$total_features_by_counts, nmads=3, type=\"lower\", log=TRUE)\n",
    "  spike.drop <- isOutlier(sce$pct_counts_ERCC, nmads=3, type=\"higher\")\n",
    "  keep <- !(libsize.drop | feature.drop | spike.drop)\n",
    "  sce <- sce[,keep]\n",
    "\n",
    "  num.cells <- nexprs(sce, byrow=TRUE)\n",
    "  to.keep <- num.cells > 0\n",
    "  print(sum(!to.keep))\n",
    "  sce <- sce[to.keep,]\n",
    "  sce\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0\n"
     ]
    }
   ],
   "source": [
    "tissue = 'sc_celseq2_5cl_p1'\n",
    "path <- paste('adenocarcinoma', tissue, sep='/')\n",
    "counts <- read.csv(paste(path, 'count', 'csv', sep='.'))\n",
    "metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))\n",
    "sce_p1 <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),\n",
    "                            colData = as.data.frame(metadata))\n",
    "\n",
    "tissue = 'sc_celseq2_5cl_p2'\n",
    "path <- paste('adenocarcinoma', tissue, sep='/')\n",
    "counts <- read.csv(paste(path, 'count', 'csv', sep='.'))\n",
    "metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))\n",
    "sce_p2 <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),\n",
    "                            colData = as.data.frame(metadata))\n",
    "\n",
    "tissue = 'sc_celseq2_5cl_p3'\n",
    "path <- paste('adenocarcinoma', tissue, sep='/')\n",
    "counts <- read.csv(paste(path, 'count', 'csv', sep='.'))\n",
    "metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))\n",
    "sce_p3 <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),\n",
    "                            colData = as.data.frame(metadata))\n",
    "\n",
    "universe <- intersect(intersect(rownames(sce_p1), rownames(sce_p2)), rownames(sce_p3))\n",
    "sce_p1 <- sce_p1[universe,]\n",
    "sce_p2 <- sce_p2[universe,]\n",
    "sce_p3 <- sce_p3[universe,]\n",
    "\n",
    "colnames(sce_p2) <- sub('p1', 'p2', colnames(sce_p2))\n",
    "colnames(sce_p3) <- sub('p1', 'p3', colnames(sce_p3))\n",
    "\n",
    "sce_celseq2_5cl <- cbind(sce_p1, sce_p2, sce_p3)\n",
    "sce_celseq2_5cl <- run_qc(sce_celseq2_5cl)\n",
    "\n",
    "colData(sce_celseq2_5cl)$label <- colData(sce_celseq2_5cl)$cell_line_demuxlet\n",
    "saveRDS(sce_celseq2_5cl, 'adenocarcinoma/sce/sc_celseq2_5cl.rds')\n",
    "name = 'sc_celseq2_5cl'\n",
    "write.csv(as.matrix(counts(sce_celseq2_5cl)), paste(name, 'counts.csv', sep='.'))\n",
    "write.csv(colData(sce_celseq2_5cl), paste(name, 'metadata.csv', sep='.'))\n",
    "write.csv(rowData(sce_celseq2_5cl), paste(name, 'featuredata.csv', sep='.'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tissue = 'sc_celseq2'\n",
    "path <- paste('adenocarcinoma', tissue, sep='/')\n",
    "counts <- read.csv(paste(path, 'count', 'csv', sep='.'))\n",
    "metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))\n",
    "sce_celseq2 <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),\n",
    "                            colData = as.data.frame(metadata))\n",
    "sce_celseq2$label <- sce_celseq2$cell_line_demuxlet\n",
    "saveRDS(sce_celseq2, 'adenocarcinoma/sce/sc_celseq2.rds')\n",
    "name = 'sc_celseq2'\n",
    "write.csv(as.matrix(counts(sce_celseq2)), paste(name, 'counts.csv', sep='.'))\n",
    "write.csv(colData(sce_celseq2), paste(name, 'metadata.csv', sep='.'))\n",
    "write.csv(rowData(sce_celseq2), paste(name, 'featuredata.csv', sep='.'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "class: SingleCellExperiment \n",
       "dim: 12653 895 \n",
       "metadata(0):\n",
       "assays(1): counts\n",
       "rownames(12653): ENSG00000154529 ENSG00000215375 ... ENSG00000137413\n",
       "  ENSG00000167157\n",
       "rowData names(8): is_feature_control is_feature_control_ERCC ...\n",
       "  total_counts log10_total_counts\n",
       "colnames(895): p1_A1 p1_A10 ... p3_P8 p3_P9\n",
       "colData names(54): unaligned aligned_unmapped ...\n",
       "  pct_counts_in_top_500_features_ERCC label\n",
       "reducedDimNames(0):\n",
       "spikeNames(1): ERCC"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sce_celseq2_5cl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "tissue = 'sc_10x'\n",
    "path <- paste('adenocarcinoma', tissue, sep='/')\n",
    "counts <- read.csv(paste(path, 'count', 'csv', sep='.'))\n",
    "metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))\n",
    "sce_10x <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),\n",
    "                            colData = as.data.frame(metadata))\n",
    "sce_10x$label <- sce_10x$cell_line_demuxlet\n",
    "saveRDS(sce_10x, 'adenocarcinoma/sce/sc_10x.rds')\n",
    "name = 'sc_10x'\n",
    "write.csv(as.matrix(counts(sce_10x)), paste(name, 'counts.csv', sep='.'))\n",
    "write.csv(colData(sce_10x), paste(name, 'metadata.csv', sep='.'))\n",
    "write.csv(rowData(sce_10x), paste(name, 'featuredata.csv', sep='.'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "tissue = 'sc_10x_5cl'\n",
    "path <- paste('adenocarcinoma', tissue, sep='/')\n",
    "counts <- read.csv(paste(path, 'count', 'csv', sep='.'))\n",
    "metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))\n",
    "sce_10x_5cl <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),\n",
    "                            colData = as.data.frame(metadata))\n",
    "sce_10x_5cl$label <- sce_10x$cell_line_demuxlet\n",
    "saveRDS(sce_10x, 'adenocarcinoma/sce/sc_10x_5cl.rds')\n",
    "name = 'sc_10x_5cl'\n",
    "write.csv(as.matrix(counts(sce_10x)), paste(name, 'counts.csv', sep='.'))\n",
    "write.csv(colData(sce_10x), paste(name, 'metadata.csv', sep='.'))\n",
    "write.csv(rowData(sce_10x), paste(name, 'featuredata.csv', sep='.'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratio <- function(df) {\n",
    "    table(df$label) / dim(df)[2]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\n",
       "    H1975     H2228    HCC827 \n",
       "0.4087591 0.2956204 0.2956204 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "274"
      ],
      "text/latex": [
       "274"
      ],
      "text/markdown": [
       "274"
      ],
      "text/plain": [
       "[1] 274"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "     A549     H1975     H2228      H838    HCC827 \n",
       "0.3586592 0.1430168 0.1474860 0.2178771 0.1329609 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "895"
      ],
      "text/latex": [
       "895"
      ],
      "text/markdown": [
       "895"
      ],
      "text/plain": [
       "[1] 895"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ratio(sce_celseq2)\n",
    "dim(sce_celseq2)[2]\n",
    "ratio(sce_celseq2_5cl)\n",
    "dim(sce_celseq2_5cl)[2]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\n",
       "    H1975     H2228    HCC827 \n",
       "0.3470067 0.3481153 0.3048780 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "902"
      ],
      "text/latex": [
       "902"
      ],
      "text/markdown": [
       "902"
      ],
      "text/plain": [
       "[1] 902"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "     A549     H1975     H2228      H838    HCC827 \n",
       "0.3205717 0.1123022 0.1934661 0.2235835 0.1500766 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "3918"
      ],
      "text/latex": [
       "3918"
      ],
      "text/markdown": [
       "3918"
      ],
      "text/plain": [
       "[1] 3918"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ratio(sce_10x)\n",
    "dim(sce_10x)[2]\n",
    "ratio(sce_10x_5cl)\n",
    "dim(sce_10x_5cl)[2]\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
